1. We try to understand two following issues in more systematic ways.
  1. Some terminology used in this report.
  1. For the number of topics, we focus on \(K + 1 (=4)\).

  2. Description of data

Parameters Value
# Document \(1000\)
Length of documents Poisson( \(200\) )
\(\alpha\) (control topic) (0.953, 0.644, 1.091, 0.508, 1.216, 0.741)

When we look at the vector \(\alpha\), we learn that words in topic 2 appear less frequent then other topics. This could be the reason that estimation of topic 2 is sometimes instable in the following results.

Prepare the corpus

docs <- list.files(folder, pattern = "*.txt", full.names = TRUE)
explore_ <- explore(docs,
             remove_numbers = FALSE, # For simulation, make it false
             remove_punct = TRUE,
             remove_symbols = TRUE,
             remove_separators = TRUE)
explore_$data_tfidf %>%
    group_by(term) %>%
    summarize(TFIDFMedian = median(tf_idf)) %>%
    arrange(desc(TFIDFMedian)) -> ranked_tfidf

Mix seeds (keywords)

We include one or two seeds from other topic (topic 3) in the keyword set of topic 1.

Result 1: two strong seeds from topic 1 and one strong seed from topic 3

seed_list <- list(c("w1t1 w367t1 w80t3"),
                c("w223t2 w541t2 w536t2"),
                c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(contam1, n = 25, show_seed = T)
##       1            2            3            T_1     
##  [1,] "w1t1 [✓]"   "w223t2 [✓]" "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1 [✓]" "w541t2 [✓]" "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1"     "w536t2 [✓]" "w223t3 [✓]" "w173t6"
##  [4,] "w80t3 [✓]"  "w0t2"       "w8t3"       "w118t5"
##  [5,] "w72t1"      "w1245t2"    "w87t3"      "w19t5" 
##  [6,] "w75t1"      "w122t2"     "w77t3"      "w45t6" 
##  [7,] "w63t1"      "w164t2"     "w127t3"     "w651t6"
##  [8,] "w449t1"     "w619t2"     "w264t3"     "w185t6"
##  [9,] "w60t1"      "w23t2"      "w153t3"     "w55t6" 
## [10,] "w832t1"     "w725t2"     "w115t3"     "w58t5" 
## [11,] "w11t1"      "w623t2"     "w143t3"     "w37t6" 
## [12,] "w405t1"     "w943t2"     "w263t3"     "w209t5"
## [13,] "w1073t1"    "w979t2"     "w92t3"      "w107t5"
## [14,] "w833t1"     "w606t2"     "w867t3"     "w66t5" 
## [15,] "w113t1"     "w1501t2"    "w593t3"     "w129t5"
## [16,] "w273t1"     "w158t2"     "w949t3"     "w33t4" 
## [17,] "w404t1"     "w134t2"     "w222t3"     "w368t5"
## [18,] "w21t1"      "w724t2"     "w884t3"     "w269t5"
## [19,] "w403t1"     "w556t2"     "w953t3"     "w91t5" 
## [20,] "w808t1"     "w568t2"     "w196t3"     "w116t5"
## [21,] "w1214t1"    "w129t2"     "w548t3"     "w379t5"
## [22,] "w274t1"     "w886t2"     "w848t3"     "w160t6"
## [23,] "w391t1"     "w159t2"     "w80t3 [1]"  "w400t5"
## [24,] "w768t1"     "w629t2"     "w245t3"     "w686t5"
## [25,] "w994t1"     "w170t2"     "w103t3"     "w430t5"
diagnosis_topic_recovery_heatmap(contam1, 25)
lda4

diagnosis_model_fit(contam1, start=2)

Result 2: two strong seeds from topic 1 and one middle quality seed from topic 3

seed_list <- list(c("w1t1 w367t1 w213t3"),
                c("w223t2 w541t2 w536t2"),
                c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(contam2, show_seed = T)
##       1            2            3            T_1     
##  [1,] "w1t1 [✓]"   "w223t2 [✓]" "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1 [✓]" "w33t4"      "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1"     "w541t2 [✓]" "w223t3 [✓]" "w173t6"
##  [4,] "w72t1"      "w536t2 [✓]" "w80t3"      "w118t5"
##  [5,] "w75t1"      "w0t2"       "w8t3"       "w19t5" 
##  [6,] "w63t1"      "w1245t2"    "w87t3"      "w45t6" 
##  [7,] "w832t1"     "w83t4"      "w77t3"      "w651t6"
##  [8,] "w449t1"     "w122t2"     "w127t3"     "w185t6"
##  [9,] "w60t1"      "w164t2"     "w264t3"     "w55t6" 
## [10,] "w11t1"      "w619t2"     "w153t3"     "w58t5"
diagnosis_topic_recovery_heatmap(contam2, 25)
lda4

diagnosis_model_fit(contam2, start=2)

Result 3: two strong seeds from topic 1 and one weak seed from other topic 3

seed_list <- list(c("w1t1 w367t1 w206t3"),
        c("w223t2 w541t2 w536t2"),
        c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(contam3, show_seed = T)
##       1            2            3            T_1     
##  [1,] "w1t1 [✓]"   "w223t2 [✓]" "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1 [✓]" "w541t2 [✓]" "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1"     "w536t2 [✓]" "w223t3 [✓]" "w173t6"
##  [4,] "w33t4"      "w0t2"       "w80t3"      "w118t5"
##  [5,] "w72t1"      "w1245t2"    "w8t3"       "w19t5" 
##  [6,] "w75t1"      "w122t2"     "w87t3"      "w45t6" 
##  [7,] "w63t1"      "w164t2"     "w77t3"      "w651t6"
##  [8,] "w449t1"     "w619t2"     "w127t3"     "w185t6"
##  [9,] "w60t1"      "w23t2"      "w264t3"     "w55t6" 
## [10,] "w11t1"      "w725t2"     "w153t3"     "w58t5"
diagnosis_topic_recovery_heatmap(contam3, 25)
lda4

diagnosis_model_fit(contam3, start=2)

Result 4: one strong seed from topic 1 and two weak seed from other topic 3

seed_list <- list(c("w1t1 w213t3 w387t3"),
        c("w223t2 w541t2 w536t2"),
        c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(contam4, show_seed = T)
##       1          2            3            T_1     
##  [1,] "w1t1 [✓]" "w223t2 [✓]" "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1"   "w541t2 [✓]" "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1"   "w536t2 [✓]" "w223t3 [✓]" "w173t6"
##  [4,] "w72t1"    "w0t2"       "w80t3"      "w118t5"
##  [5,] "w75t1"    "w1245t2"    "w8t3"       "w19t5" 
##  [6,] "w63t1"    "w122t2"     "w87t3"      "w45t6" 
##  [7,] "w832t1"   "w164t2"     "w77t3"      "w651t6"
##  [8,] "w449t1"   "w619t2"     "w127t3"     "w185t6"
##  [9,] "w60t1"    "w23t2"      "w264t3"     "w55t6" 
## [10,] "w11t1"    "w725t2"     "w153t3"     "w58t5"
diagnosis_topic_recovery_heatmap(contam4, 25)
lda4

diagnosis_model_fit(contam4, start=2)

Result 5: one strong seed from topic 1 and two weak seeds from other topic 3

seed_list <- list(c("w1t1 w1492t3 w206t3"),
        c("w223t2 w541t2 w536t2"),
        c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(contam5, show_seed = T)
##       1          2            3            T_1     
##  [1,] "w1t1 [✓]" "w223t2 [✓]" "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1"   "w33t4"      "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1"   "w541t2 [✓]" "w223t3 [✓]" "w173t6"
##  [4,] "w72t1"    "w536t2 [✓]" "w80t3"      "w118t5"
##  [5,] "w75t1"    "w0t2"       "w8t3"       "w19t5" 
##  [6,] "w63t1"    "w1245t2"    "w87t3"      "w45t6" 
##  [7,] "w449t1"   "w83t4"      "w77t3"      "w651t6"
##  [8,] "w832t1"   "w122t2"     "w127t3"     "w55t6" 
##  [9,] "w60t1"    "w164t2"     "w264t3"     "w185t6"
## [10,] "w11t1"    "w619t2"     "w153t3"     "w58t5"
diagnosis_topic_recovery_heatmap(contam5, 25)
lda4

diagnosis_model_fit(contam5, start=2)

Result 6: two middle quality seeds from topic 1 and one middle quality seed from other topic 3

seed_list <- list(c("w1281t1 w2461t1 w213t3"),
        c("w223t2 w541t2 w536t2"),
        c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(contam6, show_seed = T)
##       1        2            3            T_1     
##  [1,] "w33t5"  "w223t2 [✓]" "w26t3 [✓]"  "w173t6"
##  [2,] "w65t5"  "w541t2 [✓]" "w6t3 [✓]"   "w45t6" 
##  [3,] "w118t5" "w536t2 [✓]" "w223t3 [✓]" "w651t6"
##  [4,] "w1t1"   "w0t2"       "w80t3"      "w185t6"
##  [5,] "w19t5"  "w1245t2"    "w8t3"       "w55t6" 
##  [6,] "w367t1" "w122t2"     "w87t3"      "w37t6" 
##  [7,] "w180t1" "w164t2"     "w77t3"      "w33t4" 
##  [8,] "w58t5"  "w619t2"     "w127t3"     "w160t6"
##  [9,] "w209t5" "w23t2"      "w264t3"     "w83t4" 
## [10,] "w107t5" "w725t2"     "w153t3"     "w50t6"
diagnosis_topic_recovery_heatmap(contam6, 25, topicvec = c(1,2,3,4))
lda4

diagnosis_model_fit(contam6, start=2)

Result 7: two middle quality seeds from topic 1 and one weak seed from other topic 3

seed_list <- list(c("w1281t1 w2461t1 w206t3"),
        c("w223t2 w541t2 w536t2"),
        c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(contam7, show_seed = T)
##       1        2            3            T_1     
##  [1,] "w33t5"  "w223t2 [✓]" "w26t3 [✓]"  "w173t6"
##  [2,] "w65t5"  "w541t2 [✓]" "w6t3 [✓]"   "w45t6" 
##  [3,] "w118t5" "w536t2 [✓]" "w223t3 [✓]" "w651t6"
##  [4,] "w1t1"   "w0t2"       "w80t3"      "w185t6"
##  [5,] "w19t5"  "w1245t2"    "w8t3"       "w55t6" 
##  [6,] "w367t1" "w122t2"     "w87t3"      "w37t6" 
##  [7,] "w180t1" "w164t2"     "w77t3"      "w33t4" 
##  [8,] "w209t5" "w619t2"     "w127t3"     "w160t6"
##  [9,] "w107t5" "w23t2"      "w264t3"     "w83t4" 
## [10,] "w58t5"  "w725t2"     "w153t3"     "w50t6"
diagnosis_topic_recovery_heatmap(contam7, 25, topicvec = c(1,2,3,4))
lda4

diagnosis_model_fit(contam7, start=2)

Mix of strong, medium quality, and weak seeds within one topic

seed_list <- list(c("w1t1 w367t1 w180t1"),
                c("w223t2 w541t2 w536t2"),
                c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

explore_$visualize_tfidf(seed_list)
## $density

## 
## $median

top_terms(tf1, show_seed = T)
##       1            2            3            T_1     
##  [1,] "w1t1 [✓]"   "w173t6"     "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1 [✓]" "w45t6"      "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1 [✓]" "w651t6"     "w223t3 [✓]" "w118t5"
##  [4,] "w72t1"      "w185t6"     "w80t3"      "w19t5" 
##  [5,] "w75t1"      "w223t2 [✓]" "w8t3"       "w58t5" 
##  [6,] "w63t1"      "w55t6"      "w87t3"      "w209t5"
##  [7,] "w449t1"     "w37t6"      "w77t3"      "w107t5"
##  [8,] "w832t1"     "w160t6"     "w127t3"     "w66t5" 
##  [9,] "w60t1"      "w541t2 [✓]" "w264t3"     "w129t5"
## [10,] "w11t1"      "w536t2 [✓]" "w153t3"     "w368t5"
diagnosis_topic_recovery_heatmap(tf1, 25)
lda4

diagnosis_model_fit(tf1, start=2)

Result 1: two strong seeds and one middle quality seed

seed_list <- list(c("w1t1 w367t1 w1671t1"),
                c("w223t2 w541t2 w536t2"),
                c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(mix1, show_seed = T)
##       1            2            3            T_1     
##  [1,] "w1t1 [✓]"   "w173t6"     "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1 [✓]" "w45t6"      "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1"     "w651t6"     "w223t3 [✓]" "w118t5"
##  [4,] "w72t1"      "w185t6"     "w80t3"      "w19t5" 
##  [5,] "w75t1"      "w223t2 [✓]" "w8t3"       "w58t5" 
##  [6,] "w63t1"      "w55t6"      "w87t3"      "w209t5"
##  [7,] "w832t1"     "w37t6"      "w77t3"      "w107t5"
##  [8,] "w449t1"     "w160t6"     "w127t3"     "w66t5" 
##  [9,] "w60t1"      "w541t2 [✓]" "w264t3"     "w129t5"
## [10,] "w11t1"      "w536t2 [✓]" "w153t3"     "w33t4"
diagnosis_topic_recovery_heatmap(mix1, 25)
lda4

diagnosis_model_fit(mix1, start=2)

Result 2: two strong seeds one weak seed

seed_list <- list(c("w1t1 w367t1 w2532t1"),
                c("w223t2 w541t2 w536t2"),
                c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(mix2, show_seed = T)
##       1            2            3            T_1     
##  [1,] "w1t1 [✓]"   "w223t2 [✓]" "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1 [✓]" "w541t2 [✓]" "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1"     "w536t2 [✓]" "w223t3 [✓]" "w173t6"
##  [4,] "w72t1"      "w0t2"       "w80t3"      "w118t5"
##  [5,] "w75t1"      "w1245t2"    "w8t3"       "w19t5" 
##  [6,] "w63t1"      "w122t2"     "w87t3"      "w45t6" 
##  [7,] "w832t1"     "w164t2"     "w77t3"      "w185t6"
##  [8,] "w449t1"     "w619t2"     "w127t3"     "w651t6"
##  [9,] "w60t1"      "w23t2"      "w264t3"     "w55t6" 
## [10,] "w11t1"      "w725t2"     "w153t3"     "w58t5"
diagnosis_topic_recovery_heatmap(mix2, 25)
lda4

diagnosis_model_fit(mix2, start=2)

Result 3: one strong seed and two middle quality seeds

seed_list <- list(c("w1t1 w1185t1 w1671t1"),
        c("w223t2 w541t2 w536t2"),
        c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(mix3, show_seed = T)
##       1          2            3            T_1     
##  [1,] "w1t1 [✓]" "w223t2 [✓]" "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1"   "w541t2 [✓]" "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1"   "w536t2 [✓]" "w223t3 [✓]" "w173t6"
##  [4,] "w72t1"    "w0t2"       "w80t3"      "w118t5"
##  [5,] "w75t1"    "w1245t2"    "w8t3"       "w19t5" 
##  [6,] "w63t1"    "w122t2"     "w87t3"      "w45t6" 
##  [7,] "w449t1"   "w164t2"     "w77t3"      "w651t6"
##  [8,] "w832t1"   "w619t2"     "w127t3"     "w185t6"
##  [9,] "w11t1"    "w23t2"      "w264t3"     "w55t6" 
## [10,] "w405t1"   "w725t2"     "w153t3"     "w58t5"
diagnosis_topic_recovery_heatmap(mix3, 25)
lda4

diagnosis_model_fit(mix3, start=2)

Result 4: one strong seed and two weak seeds

seed_list <- list(c("w1t1 w2461t1 w2532t1"),
        c("w223t2 w541t2 w536t2"),
        c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(mix4, show_seed = T)
##       1          2            3            T_1     
##  [1,] "w1t1 [✓]" "w173t6"     "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1"   "w45t6"      "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1"   "w651t6"     "w223t3 [✓]" "w118t5"
##  [4,] "w72t1"    "w185t6"     "w80t3"      "w19t5" 
##  [5,] "w75t1"    "w223t2 [✓]" "w8t3"       "w58t5" 
##  [6,] "w63t1"    "w55t6"      "w87t3"      "w209t5"
##  [7,] "w832t1"   "w37t6"      "w77t3"      "w107t5"
##  [8,] "w449t1"   "w160t6"     "w127t3"     "w66t5" 
##  [9,] "w60t1"    "w541t2 [✓]" "w264t3"     "w129t5"
## [10,] "w11t1"    "w536t2 [✓]" "w153t3"     "w33t4"
diagnosis_topic_recovery_heatmap(mix4, 25)
lda4

diagnosis_model_fit(mix4, start=2)

Result 5: two middle quality seeds a weak seed

seed_list <- list(c("w1281t1 w2461t1 w2532t1"),
        c("w223t2 w541t2 w536t2"),
        c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(mix5, show_seed = T)
##       1        2            3            T_1     
##  [1,] "w1t1"   "w173t6"     "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1" "w45t6"      "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1" "w651t6"     "w223t3 [✓]" "w118t5"
##  [4,] "w33t4"  "w185t6"     "w80t3"      "w19t5" 
##  [5,] "w72t1"  "w223t2 [✓]" "w8t3"       "w58t5" 
##  [6,] "w75t1"  "w55t6"      "w87t3"      "w209t5"
##  [7,] "w63t1"  "w37t6"      "w77t3"      "w107t5"
##  [8,] "w832t1" "w541t2 [✓]" "w127t3"     "w66t5" 
##  [9,] "w449t1" "w160t6"     "w264t3"     "w129t5"
## [10,] "w60t1"  "w536t2 [✓]" "w153t3"     "w368t5"
diagnosis_topic_recovery_heatmap(mix5, 25)
lda4

diagnosis_model_fit(mix5, start=2)

Result 6: one middle quality seed and two week seed

seed_list <- list(c("w1281t1 w1185t1 w2532t1"),
        c("w223t2 w541t2 w536t2"),
        c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(mix6, show_seed = T)
##       1        2            3            T_1     
##  [1,] "w173t6" "w223t2 [✓]" "w26t3 [✓]"  "w33t5" 
##  [2,] "w1t1"   "w541t2 [✓]" "w6t3 [✓]"   "w65t5" 
##  [3,] "w45t6"  "w536t2 [✓]" "w223t3 [✓]" "w118t5"
##  [4,] "w367t1" "w0t2"       "w80t3"      "w19t5" 
##  [5,] "w651t6" "w1245t2"    "w8t3"       "w58t5" 
##  [6,] "w185t6" "w122t2"     "w87t3"      "w209t5"
##  [7,] "w180t1" "w164t2"     "w77t3"      "w107t5"
##  [8,] "w37t6"  "w619t2"     "w127t3"     "w66t5" 
##  [9,] "w55t6"  "w23t2"      "w264t3"     "w129t5"
## [10,] "w72t1"  "w725t2"     "w153t3"     "w33t4"
diagnosis_topic_recovery_heatmap(mix6, 25, topicvec = c(1,2,3,4))
lda4

diagnosis_model_fit(mix6, start=2)

Result 7: one strong, one middle quality, one weak seeds

seed_list <- list(c("w1t1 w1185t1 w2532t1"),
        c("w223t2 w541t2 w536t2"),
        c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

top_terms(mix7, show_seed = T)
##       1          2            3            T_1     
##  [1,] "w1t1 [✓]" "w173t6"     "w26t3 [✓]"  "w33t5" 
##  [2,] "w367t1"   "w45t6"      "w6t3 [✓]"   "w65t5" 
##  [3,] "w180t1"   "w651t6"     "w223t3 [✓]" "w118t5"
##  [4,] "w72t1"    "w185t6"     "w80t3"      "w19t5" 
##  [5,] "w75t1"    "w223t2 [✓]" "w8t3"       "w58t5" 
##  [6,] "w63t1"    "w55t6"      "w87t3"      "w209t5"
##  [7,] "w832t1"   "w37t6"      "w77t3"      "w107t5"
##  [8,] "w449t1"   "w33t4"      "w127t3"     "w66t5" 
##  [9,] "w60t1"    "w541t2 [✓]" "w264t3"     "w129t5"
## [10,] "w11t1"    "w160t6"     "w153t3"     "w368t5"
diagnosis_topic_recovery_heatmap(mix7, 25, topicvec = c(1,2,3,4))
lda4

diagnosis_model_fit(mix7, start=2)

Result 8: three week seeds

seed_list <- list(c("w2132t1 w2461t1 w2532t1"),
          c("w223t2 w541t2 w536t2"),
          c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)

explore_$visualize_tfidf(seed_list)
## $density

## 
## $median

top_terms(tf_sp, show_seed = T)
##       1        2            3            T_1     
##  [1,] "w33t5"  "w223t2 [✓]" "w26t3 [✓]"  "w173t6"
##  [2,] "w65t5"  "w541t2 [✓]" "w6t3 [✓]"   "w45t6" 
##  [3,] "w118t5" "w536t2 [✓]" "w223t3 [✓]" "w651t6"
##  [4,] "w1t1"   "w0t2"       "w80t3"      "w185t6"
##  [5,] "w19t5"  "w1245t2"    "w8t3"       "w55t6" 
##  [6,] "w367t1" "w122t2"     "w87t3"      "w37t6" 
##  [7,] "w180t1" "w164t2"     "w77t3"      "w33t4" 
##  [8,] "w58t5"  "w619t2"     "w127t3"     "w160t6"
##  [9,] "w209t5" "w23t2"      "w264t3"     "w83t4" 
## [10,] "w107t5" "w725t2"     "w153t3"     "w50t6"
diagnosis_topic_recovery_heatmap(tf_sp, 25, topicvec = c(1,2,3,4))
lda4

diagnosis_model_fit(tf_sp, start=2)